This notebook performs Exploratory Data Analysis on the motor vehicle crashes data aggregated with the ACS data

LOAD DATA

df <- read_csv("../data/processed/acs_mvc_combined_2018_2023.csv")
glimpse(df)
## Rows: 13,984
## Columns: 50
## $ geoid                   <dbl> 3.6005e+10, 3.6005e+10, 3.6005e+10, 3.6005e+10…
## $ year                    <dbl> 2018, 2019, 2020, 2021, 2022, 2023, 2018, 2019…
## $ total_population        <dbl> 47900.000, 46630.000, 44881.000, 45523.000, 30…
## $ pct_male_population     <dbl> 13.576200, 13.607120, 13.691763, 13.568965, 13…
## $ pct_female_population   <dbl> 1.2045929, 1.1130174, 1.0137920, 1.0631988, 0.…
## $ pct_white_population    <dbl> 3.7014614, 4.7887626, 5.7039727, 5.8871340, 6.…
## $ pct_black_population    <dbl> 8.849687, 8.200729, 7.666941, 7.187576, 6.7581…
## $ pct_asian_population    <dbl> 0.26096033, 0.38816213, 0.38100755, 0.38881445…
## $ pct_hispanic_population <dbl> 4.862213, 5.161913, 5.158085, 4.758034, 3.8374…
## $ pct_foreign_born        <dbl> 2.206681, 2.365430, 2.319467, 2.220855, 1.9907…
## $ pct_age_under_18        <dbl> 0.3569937, 0.2208878, 0.2161271, 0.2152758, 0.…
## $ pct_age_18_34           <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.0000…
## $ pct_age_35_64           <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.0000…
## $ pct_age_65_plus         <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.…
## $ median_income           <dbl> 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 5858…
## $ pct_income_under_25k    <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.…
## $ pct_income_25k_75k      <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.…
## $ pct_income_75k_plus     <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.…
## $ pct_below_poverty       <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.…
## $ pct_above_poverty       <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.0000…
## $ poverty_rate            <dbl> NA, NA, NA, NA, NA, NA, 22.633201, 22.440424, …
## $ median_gross_rent       <dbl> 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000…
## $ pct_owner_occupied      <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.…
## $ pct_renter_occupied     <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.…
## $ pct_no_vehicle          <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.…
## $ pct_one_vehicle         <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.…
## $ pct_two_plus_vehicles   <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000000…
## $ pct_less_than_hs        <dbl> 6.3924843, 6.0261634, 5.7017446, 5.6608747, 4.…
## $ pct_hs_diploma          <dbl> 2.0229645, 2.7364358, 3.0547448, 3.4619862, 3.…
## $ pct_some_college        <dbl> 1.2526096, 1.1237401, 1.0405294, 0.9797245, 1.…
## $ pct_associates_degree   <dbl> 0.1691023, 0.1737079, 0.2339520, 0.2723898, 0.…
## $ pct_bachelors_degree    <dbl> 0.14822547, 0.15440703, 0.18938972, 0.23724271…
## $ pct_graduate_degree     <dbl> 0.01670146, 0.01930088, 0.04233417, 0.05272060…
## $ pct_in_labor_force      <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.0000…
## $ pct_employed            <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.0000…
## $ pct_unemployed          <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.…
## $ pct_not_in_labor_force  <dbl> 14.780793, 14.720137, 14.705555, 14.632164, 14…
## $ unemployment_rate       <dbl> NA, NA, NA, NA, NA, NA, 15.750133, 13.478747, …
## $ pct_commute_short       <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.…
## $ pct_commute_medium      <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.…
## $ pct_commute_long        <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.0000…
## $ pct_drive_alone         <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.0000…
## $ pct_carpool             <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000000…
## $ pct_public_transit      <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.0000…
## $ pct_walk                <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000000…
## $ pct_bike                <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000000…
## $ pct_work_from_home      <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000000…
## $ crash_rate_per_1000     <dbl> 0.04175365, 0.08578169, 0.00000000, 0.00000000…
## $ injury_rate_per_1000    <dbl> 0.02087683, 0.04289084, 0.00000000, 0.00000000…
## $ fatality_rate_per_1000  <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000000…

CONVERT DATA TYPES

df <- df %>%
  mutate(
    geoid = as.character(geoid),
    year = as.integer(year),
    total_population = as.integer(total_population)
  )
glimpse(df)
## Rows: 13,984
## Columns: 50
## $ geoid                   <chr> "36005000100", "36005000100", "36005000100", "…
## $ year                    <int> 2018, 2019, 2020, 2021, 2022, 2023, 2018, 2019…
## $ total_population        <int> 47900, 46630, 44881, 45523, 30541, 24345, 5134…
## $ pct_male_population     <dbl> 13.576200, 13.607120, 13.691763, 13.568965, 13…
## $ pct_female_population   <dbl> 1.2045929, 1.1130174, 1.0137920, 1.0631988, 0.…
## $ pct_white_population    <dbl> 3.7014614, 4.7887626, 5.7039727, 5.8871340, 6.…
## $ pct_black_population    <dbl> 8.849687, 8.200729, 7.666941, 7.187576, 6.7581…
## $ pct_asian_population    <dbl> 0.26096033, 0.38816213, 0.38100755, 0.38881445…
## $ pct_hispanic_population <dbl> 4.862213, 5.161913, 5.158085, 4.758034, 3.8374…
## $ pct_foreign_born        <dbl> 2.206681, 2.365430, 2.319467, 2.220855, 1.9907…
## $ pct_age_under_18        <dbl> 0.3569937, 0.2208878, 0.2161271, 0.2152758, 0.…
## $ pct_age_18_34           <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.0000…
## $ pct_age_35_64           <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.0000…
## $ pct_age_65_plus         <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.…
## $ median_income           <dbl> 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 5858…
## $ pct_income_under_25k    <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.…
## $ pct_income_25k_75k      <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.…
## $ pct_income_75k_plus     <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.…
## $ pct_below_poverty       <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.…
## $ pct_above_poverty       <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.0000…
## $ poverty_rate            <dbl> NA, NA, NA, NA, NA, NA, 22.633201, 22.440424, …
## $ median_gross_rent       <dbl> 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000…
## $ pct_owner_occupied      <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.…
## $ pct_renter_occupied     <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.…
## $ pct_no_vehicle          <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.…
## $ pct_one_vehicle         <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.…
## $ pct_two_plus_vehicles   <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000000…
## $ pct_less_than_hs        <dbl> 6.3924843, 6.0261634, 5.7017446, 5.6608747, 4.…
## $ pct_hs_diploma          <dbl> 2.0229645, 2.7364358, 3.0547448, 3.4619862, 3.…
## $ pct_some_college        <dbl> 1.2526096, 1.1237401, 1.0405294, 0.9797245, 1.…
## $ pct_associates_degree   <dbl> 0.1691023, 0.1737079, 0.2339520, 0.2723898, 0.…
## $ pct_bachelors_degree    <dbl> 0.14822547, 0.15440703, 0.18938972, 0.23724271…
## $ pct_graduate_degree     <dbl> 0.01670146, 0.01930088, 0.04233417, 0.05272060…
## $ pct_in_labor_force      <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.0000…
## $ pct_employed            <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.0000…
## $ pct_unemployed          <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.…
## $ pct_not_in_labor_force  <dbl> 14.780793, 14.720137, 14.705555, 14.632164, 14…
## $ unemployment_rate       <dbl> NA, NA, NA, NA, NA, NA, 15.750133, 13.478747, …
## $ pct_commute_short       <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.…
## $ pct_commute_medium      <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.…
## $ pct_commute_long        <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.0000…
## $ pct_drive_alone         <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.0000…
## $ pct_carpool             <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000000…
## $ pct_public_transit      <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.0000…
## $ pct_walk                <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000000…
## $ pct_bike                <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000000…
## $ pct_work_from_home      <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000000…
## $ crash_rate_per_1000     <dbl> 0.04175365, 0.08578169, 0.00000000, 0.00000000…
## $ injury_rate_per_1000    <dbl> 0.02087683, 0.04289084, 0.00000000, 0.00000000…
## $ fatality_rate_per_1000  <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000000…

DATA QUALITY CHECKS

# Visualize missingness
vis_miss(df)

# Count NA values per column
colSums(is.na(df))
##                   geoid                    year        total_population 
##                       0                       0                       0 
##     pct_male_population   pct_female_population    pct_white_population 
##                     426                     426                     426 
##    pct_black_population    pct_asian_population pct_hispanic_population 
##                     426                     426                     426 
##        pct_foreign_born        pct_age_under_18           pct_age_18_34 
##                     426                     426                     426 
##           pct_age_35_64         pct_age_65_plus           median_income 
##                     426                     426                       0 
##    pct_income_under_25k      pct_income_25k_75k     pct_income_75k_plus 
##                     426                     426                     426 
##       pct_below_poverty       pct_above_poverty            poverty_rate 
##                     426                     426                     455 
##       median_gross_rent      pct_owner_occupied     pct_renter_occupied 
##                       0                     426                     426 
##          pct_no_vehicle         pct_one_vehicle   pct_two_plus_vehicles 
##                     426                     426                     426 
##        pct_less_than_hs          pct_hs_diploma        pct_some_college 
##                     426                     426                     426 
##   pct_associates_degree    pct_bachelors_degree     pct_graduate_degree 
##                     426                     426                     426 
##      pct_in_labor_force            pct_employed          pct_unemployed 
##                     426                     426                     426 
##  pct_not_in_labor_force       unemployment_rate       pct_commute_short 
##                     426                     464                     426 
##      pct_commute_medium        pct_commute_long         pct_drive_alone 
##                     426                     426                     426 
##             pct_carpool      pct_public_transit                pct_walk 
##                     426                     426                     426 
##                pct_bike      pct_work_from_home     crash_rate_per_1000 
##                     426                     426                     426 
##    injury_rate_per_1000  fatality_rate_per_1000 
##                     426                     426

Identify and remove 426 rows with extensive missingness

df$na_rate <- rowMeans(is.na(df))

# Filter out rows with high missingness
df <- df %>%
    filter(rowMeans(is.na(.)) <= 0.05)

# Drop 29 rows with poverty_rate NA and/or 38 unemployment_rate NA values
df <- df %>%
    drop_na(c(unemployment_rate, poverty_rate))

# Drop na_rate column
df <- df %>% select(-na_rate)

NO NA values! ## ADD PANDEMIC TEMPORAL FEATURE

df <- df %>%
        mutate(pct_vehicle = pct_one_vehicle + pct_two_plus_vehicles,
               post_pandemic = ifelse(year < 2020, "pre", "post")  # pre vs. post
               ) %>%
    mutate(post_pandemic = as.integer(post_pandemic == "post"))

ADD INTERACTION FEATURES

df <- df %>%
  mutate(
    poverty_vehicle_interaction = pct_below_poverty * pct_vehicle,
    unemployment_vehicle_interaction = unemployment_rate * pct_vehicle
  )

Winsorization of extreme values

replace_top_99_with_median <- function(x) {
  # Ensure numeric
  if (!is.numeric(x)) stop("Column must be numeric.")
  
  # Calculate the 99th percentile and median
  upper_threshold <- quantile(x, 0.99, na.rm = TRUE)
  median_val <- median(x, na.rm = TRUE)
  
  # Replace values above the 99th percentile with the median
  x[x > upper_threshold] <- median_val
  return(x)
}

per_1000_vars <- grep("per_1000|interaction", names(df), value = TRUE)
print(per_1000_vars)
## [1] "crash_rate_per_1000"              "injury_rate_per_1000"            
## [3] "fatality_rate_per_1000"           "poverty_vehicle_interaction"     
## [5] "unemployment_vehicle_interaction"
df[per_1000_vars] <- lapply(df[per_1000_vars], replace_top_99_with_median)

Univariate Analysis

GROUP CORRELATED SOCIO-ECONOMIC VARIABLES

numeric_vars <- sapply(df, is.numeric)
cor_matrix <- cor(df[ , numeric_vars], 
                  use = "pairwise.complete.obs")

# Identify columns to drop (to reduce multicollinearity)
high_corr <- caret::findCorrelation(cor_matrix, cutoff = 0.8)
colnames(cor_matrix)[high_corr]
## [1] "pct_income_75k_plus"   "poverty_rate"          "pct_below_poverty"    
## [4] "pct_employed"          "pct_no_vehicle"        "pct_drive_alone"      
## [7] "unemployment_rate"     "pct_female_population" "post_pandemic"
# Open a PNG device
png("../report/plots/pairwise_correlation_matrix.png", width = 1200, height = 1000, res = 300)

# Draw the plot
corrplot(cor_matrix, method = "color", type = "upper", tl.cex = 0.5)

# Close the device
dev.off()
## quartz_off_screen 
##                 2

DROP REDUNDANT VARIABLES

df <- df %>%
    select(-c(poverty_rate,
              pct_above_poverty,
              pct_employed,
              pct_unemployed,
              pct_drive_alone,
              pct_one_vehicle,
              pct_two_plus_vehicles,
              pct_female_population,
              pct_no_vehicle,
              pct_income_75k_plus)) 

ADD BOROUGHS

df <- df %>%
  mutate(
    borough = case_when(
      substr(geoid, 1, 5) == "36005" ~ "Bronx",
      substr(geoid, 1, 5) == "36047" ~ "Brooklyn",
      substr(geoid, 1, 5) == "36061" ~ "Manhattan",
      substr(geoid, 1, 5) == "36081" ~ "Queens",
      substr(geoid, 1, 5) == "36085" ~ "Staten Island",
      TRUE ~ "Unknown"
    )
  )

Export clean data

saveRDS(df, "../data/models/social-risk-crash-rate-data.rds")